Sentiment Analysis


In [ ]:
import pandas as pd
import numpy as np

sentiment_data = pd.read_csv("naver_ratings.txt",sep='\t')

In [ ]:
# Naver rating labels: label == 1 (positive), 0 (negative)
sentiment_data.head()
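
Before sampling, it is worth checking how many reviews fall in each class (a quick sanity check on the label column shown above):


In [ ]:
# number of reviews per label (1 = positive, 0 = negative)
sentiment_data['label'].value_counts()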

In [ ]:
# select a random sample of reviews with the given label
def select_reviews(data, num, label):
    label_data = data[data.label == label].reset_index(drop=True)
    index = np.random.randint(len(label_data), size=num)
    return label_data.iloc[index]

In [ ]:
# randomly sample positive and negative reviews
positive_data = select_reviews(sentiment_data, 100, 1)
negative_data = select_reviews(sentiment_data, 100, 0)

In [ ]:
# head of sample positive data
positive_data.head()

In [ ]:
# head of sample negative data
negative_data.head()

In [ ]:
# collect the review texts into a plain list

def doc_merge(data):
    merged_docs = []

    for doc in data["document"]:
        merged_docs.append(doc)

    return merged_docs

In [ ]:
positive_merged_doc = doc_merge(positive_data)
negative_merged_doc = doc_merge(negative_data)

In [ ]:
positive_merged_doc[0]

Treating the "adjective-like" class here as {determiners, adverbs}, only tokens whose Kkma tag starts with 'M' (modifiers) are extracted.


In [ ]:
from konlpy.tag import Kkma

kkma = Kkma()

# keep only morphemes whose Kkma tag starts with 'M' (modifiers) from the positive reviews
positive_pos = []
for doc in positive_merged_doc:
    for pos in kkma.pos(doc):
        if pos[1][0] == 'M':
            positive_pos.append(pos[0])

# same extraction for the negative reviews
negative_pos = []
for doc in negative_merged_doc:
    for pos in kkma.pos(doc):
        if pos[1][0] == 'M':
            negative_pos.append(pos[0])

In [ ]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from wordcloud import WordCloud

wordcloud = WordCloud(font_path=r'C:\Windows\Fonts\Daum_SemiBold.ttf')

vectorizer = CountVectorizer(min_df=1)
pos_bow = vectorizer.fit_transform(positive_pos)
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(pos_bow.toarray())

# total count of each extracted word across the positive samples
# (the tf-idf weights above are not used below; raw counts drive the word cloud)
word_tf = dict(zip(vectorizer.get_feature_names(), pos_bow.toarray().sum(axis=0)))

Word cloud for positive reviews


In [ ]:
wordcloud.generate_from_frequencies(word_tf).to_image()

In [ ]:
vectorizer = CountVectorizer(min_df=1)
neg_bow = vectorizer.fit_transform(negative_pos)
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(neg_bow.toarray())

# total count of each extracted word across the negative samples
word_tf = dict(zip(vectorizer.get_feature_names(), neg_bow.toarray().sum(axis=0)))

Word cloud for negative reviews


In [ ]:
wordcloud.generate_from_frequencies(word_tf).to_image()
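
For finer control over rendering, the same WordCloud object can also be drawn with matplotlib instead of to_image() (a minimal sketch, assuming matplotlib is installed; it displays whichever cloud was generated last):


In [ ]:
import matplotlib.pyplot as plt

# render the most recently generated word cloud at a larger size
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()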

Classifying reviews as positive or negative


In [ ]:
# positive_data2 = select_reviews(sentiment_data, 1000, 1)
# negative_data2 = select_reviews(sentiment_data, 1000, 0)

positive_data.head()

In [ ]:
# merge the positive and negative samples into one dataframe
total_data = pd.concat((positive_data, negative_data))

In [ ]:
print(len(positive_data), len(negative_data), len(total_data))

In [ ]:
X = total_data['document']
y = total_data['label']

In [ ]:
# generate the BoW matrix & split into training/test sets
from sklearn.model_selection import train_test_split

vectorizer = CountVectorizer(min_df=1)
bow = vectorizer.fit_transform(X)
X_trn, X_tst, y_trn, y_tst = train_test_split(bow, y, test_size=0.3)

Model training


In [ ]:
from sklearn import linear_model

model = linear_model.LogisticRegression(penalty='l2')
model.fit(X_trn,y_trn)
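
Because only 200 sampled reviews are used here, a single train/test split can be noisy; a quick cross-validated accuracy gives a more stable estimate (a sketch, reusing the bow matrix and labels built above):


In [ ]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy of the same model on the full sampled data
scores = cross_val_score(linear_model.LogisticRegression(penalty='l2'), bow, y, cv=5)
print(scores.mean())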

In [ ]:
y_pred = model.predict(X_tst)

Performance evaluation - accuracy


In [ ]:
from sklearn.metrics import confusion_matrix
conf_mat = confusion_matrix(y_tst,y_pred)

In [ ]:
# accuracy = (correctly classified) / (all test samples)
acc = (conf_mat[0, 0] + conf_mat[1, 1]) / conf_mat.sum()
print("accuracy = %f" % acc)

Test - on one positive and one negative review


In [ ]:
test_sentiment = ['꼭 보세요 강추합니다 한번 더 보고 싶은 영화에요', '내가 이걸 왜 봤는지 모르겠다. 사전에 검색좀 해보고 볼걸 아.. 짜증나']

In [ ]:
# build a new BoW matrix using the existing vocabulary
vectorizer2 = CountVectorizer(min_df=1, vocabulary=vectorizer.vocabulary_)

In [ ]:
new_input = vectorizer2.fit_transform(test_sentiment)
print(new_input.get_shape())

In [ ]:
model.predict_proba(new_input.toarray())
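
predict_proba returns one row per review, with columns ordered by model.classes_; a short sketch to print each test sentence with its predicted label and positive-class probability (the variable names here are just for illustration):


In [ ]:
# pair each test review with its prediction and the probability of the positive class
probs = model.predict_proba(new_input)
preds = model.predict(new_input)
pos_col = list(model.classes_).index(1)  # column holding P(label == 1)

for sentence, label, p in zip(test_sentiment, preds, probs[:, pos_col]):
    print("%s -> label=%d, P(positive)=%.3f" % (sentence, label, p))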